from __future__ import with_statement
import re
import os, os.path

## Note that the PESC documents change their format.  The original version of this file
## worked just fine for the years 94-04.  It starts having problems at 05.  From then on
## the briefs have a lot of html formatting in them


def filepaths(top_path):
    """Walk ``top_path`` recursively, yielding a pair for every file found.

    Each yielded pair is ``(name, full_path)``: the bare file name as
    reported by ``os.walk`` and that name joined onto the directory it
    lives in.  Directories themselves are never yielded.
    """
    for folder, _subfolders, filenames in os.walk(top_path):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            yield filename, full_path

def fixfilename(filename):
    """Expand the two-digit year at the start of ``filename`` to four digits.

    ``filename`` is a string whose first two characters are read as an
    integer year.  Years above 50 are placed in the 1900s, everything
    else in the 2000s.  The remainder of the name is returned untouched.

    Raises ValueError when the first two characters are not an integer.
    """
    yy, remainder = filename[:2], filename[2:]
    century = "19" if int(yy) > 50 else "20"
    return century + yy + remainder

def levdist(first, second):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    It is symmetric, so the argument order does not matter.

    Only two rows of the dynamic-programming table are kept at a time, and
    only ``range``/``enumerate`` are used, so the function runs unchanged
    on both Python 2 and Python 3 (the previous version mixed ``range``
    with the Python-2-only ``xrange``).
    """
    # Keep the shorter string in `first` so the row buffers stay small.
    if len(first) > len(second):
        first, second = second, first

    # previous[k] = distance between the first k chars of `first` and the
    # portion of `second` consumed so far (initially nothing).
    previous = list(range(len(first) + 1))
    for row, second_char in enumerate(second):
        current = [row + 1]
        for col, first_char in enumerate(first):
            deletion = previous[col + 1] + 1
            insertion = current[col] + 1
            substitution = previous[col]
            if first_char != second_char:
                substitution += 1
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]

def mk4dig(num):
    """Zero-pad ``str(num)`` on the left and keep its last four characters.

    Values longer than four characters are truncated to their final four.
    Only three zeros are prepended, so an empty string comes back as "0".
    """
    padded = "000" + str(num)
    return padded[len(padded) - 4:]

def mk2dig(num):
    """Zero-pad ``str(num)`` on the left and keep its last two characters.

    Values longer than two characters are truncated to their final two.
    """
    return ("000" + str(num))[-2:]

# Attempt 1 at a general function to fix the date
def fixtheirdate(theirdate, fname):
    """Turn a space-separated date line into a ``YYYYMMDD``-style string.

    ``theirdate`` is split on single spaces.  Token 0 is ignored
    (presumably a weekday name -- not verified here), token 1 is the day,
    token 2 is a month name looked up in the module-level ``months``
    table, and token 3 is the year.  Day and month are normalised with
    ``mk2dig``; the year token is used as-is.

    ``fname`` is only used in the diagnostic message.

    Returns "00000000" (after printing a message) when the line has fewer
    than four tokens or the month name is not in ``months``.
    """
    # Collapse every run of spaces.  Replacing only '  ' pairs left runs of
    # three or more spaces partly intact, which produced empty tokens.
    tokens = re.sub(' {2,}', ' ', theirdate).split(' ')
    try:
        briefday, monthname, briefyear = tokens[1], tokens[2], tokens[3]
        briefmon = months[monthname]
    except (IndexError, KeyError):
        # Too few tokens, or an unknown month name.
        print("Breaking down on " + fname)
        return "00000000"
    return briefyear + mk2dig(briefmon) + mk2dig(briefday)

# Attempt 2 at a general function...
def fixdate(theirdate, fname):
    """Turn a "DD <month name> YYYY"-like date into a compact digit string.

    HTML tags, spaces and commas are stripped from ``theirdate``.  In what
    remains, the first two characters are taken as the day, the last four
    characters are assumed to be the year, and everything in between is a
    month name looked up in the module-level ``months`` table.

    The result is ``<last two characters of the year> + MM + DD``.
    ``fname`` is only used in the diagnostic message.

    Returns "00000000" (after printing a message) when the month name is
    not found in ``months``.

    NOTE(review): the success value carries a two-digit year (6 characters)
    while the failure sentinel has 8 characters -- confirm which width the
    callers expect.  A single-digit day without a leading zero shifts the
    slices and ends up as a failed month lookup.
    """
    # Remove tags first and replace them with nothing.  Substituting a
    # space after the spaces had already been removed re-introduced spaces
    # and threw off the fixed-offset slicing below.
    cleaned = re.sub(r'<.*?>', '', theirdate)
    cleaned = cleaned.replace(' ', '').replace(',', '')

    briefday = cleaned[:2]
    monthname = cleaned[2:-4]
    briefyear = cleaned[-2:]
    try:
        briefmon = months[monthname]
    except KeyError:
        print("Breaking down on " + fname)
        return "00000000"
    return briefyear + mk2dig(briefmon) + mk2dig(briefday)
    

################################################
############# BEGIN SCRIPT #####################
################################################

#oldpath = 'C:\Documents and Settings\John Sheffield\Desktop\Simmons RA\EuroData'
#newpath = 'C:\Documents and Settings\John Sheffield\Desktop\Simmons RA\EuroBriefs'
#infopath = 'C:\Documents and Settings\John Sheffield\Desktop\Simmons RA'

# Input directory, output directory and project root.  Raw strings keep
# every backslash literal; the previous spelling mixed '\\' with sequences
# such as '\R' and '\E', which only worked because Python leaves unknown
# escapes alone (and now warns about them).
oldpath = r'C:\Documents and Settings\Rich\Desktop\Rewards for Ratification\text analysis\EuroData14jul2010'
newpath = r'C:\Documents and Settings\Rich\Desktop\Rewards for Ratification\text analysis\EuroBriefs14jul2010'
infopath = r'C:\Documents and Settings\Rich\Desktop\Rewards for Ratification\text analysis'

# Month name -> month number (1-12).  Each name is registered twice: once
# capitalised and once fully lower-case.
months = dict(
    (spell(name), number)
    for spell in (str, str.lower)
    for number, name in zip(
        range(1, 13),
        ('January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'),
    )
)

locations = filepaths(oldpath)
fkeys = []

# For every file under `oldpath`: cut the press-release body, date line,
# title and description out of the HTML and write them as plain text to a
# file in `newpath` whose name carries a four-digit year.
for name, path in locations:
    # fixfilename() raises ValueError when the name does not start with two
    # digits.  Skip such stray files instead of aborting the whole run.
    try:
        newfn = fixfilename(name)
    except ValueError:
        print("skipping file without a two-digit year prefix: " + path)
        continue
    newfp = os.path.join(newpath, newfn)

    with open(path) as f:
        brief = f.read()

    # Slice the brief.  Each `split(marker, 1)[1]` raises IndexError when
    # the marker is missing from the document.
    # Markers tried previously (kept for reference):
    #   body: '<H3>' ... '</div>'
    #   body: 'div class="pressReleaseContentMain"' ... '</div>'
    #   date: '</B>\n<P>' ... '\n'   and   '</b></p>' ... '</p>'
    try:
        # Main text: from the content <div> up to the end of the body
        # (this is the variant aimed at the 2005+ layout).
        briefdoc = brief.split('div class="pressReleaseContentMain">', 1)[1].split('</body>', 1)[0]
        # Date line.
        briefdline = brief.split('Date:&nbsp;', 1)[1].split('</td></tr>', 1)[0]
        # Title and description from the <meta> tags.
        brieftitle = brief.split('<meta name="Title" content="', 1)[1].split('" />', 1)[0]
        briefdesc = brief.split('<meta name="Description" content="', 1)[1].split('" />', 1)[0]
    except IndexError:
        print("problem with brief or date in " + path)
        continue

    # Drop HTML tags from the body, replacing each with a space.
    # Open question from the original author: what to do with link anchors?
    brieftext = re.sub(r'<.*?>', ' ', briefdoc)
    # Strip tabs, newlines and carriage returns from the date line.
    briefdate = re.sub(r'[\t\n\r]', '', briefdline)

    brieftext = "DATE = " + briefdate + "\n\n  TITLE: " + brieftitle + "\n\n  DESCRIPTION: " + briefdesc + "\n\n  BRIEF: " + brieftext

    # Write the sliced brief with the date in the header position; `with`
    # guarantees the handle is closed even if the write fails.
    with open(newfp, 'w') as newfile:
        newfile.write(brieftext)

    

    
    
        
    
